Data Visualization
Climate
Precipitation
Code
## read data and aggregate
df <- data.frame(matrix(ncol = 6, nrow = 0))
colnames(df) <- c("station","name","date","prcp","tmax","tmin")
for (i in 1:3){
file_name <- paste0("climate",i,".csv")
df_curr <- read.csv(paste0("./data/",file_name), header = T)
colnames(df_curr) <- c("station","name","date","prcp","tmax","tmin")
df <- rbind(df_curr,df)
}
## group by date: each day, we have one average prcp, tmax, and tmin based on all stations in Toledo
climate <- df %>%
group_by(date) %>%
summarise_at(vars(prcp,tmax,tmin), mean, na.rm = TRUE)
climate$date<-as.Date(climate$date,"%Y-%m-%d")
## sanity check
# any(duplicated(climate$date))
# min(climate$date);max(climate$date)
### climate for turbidity model, 2021-11-24 to 2022-12-31
# select data after 2021-11-24 and before 2022-12-31
climate_trub <- climate[(climate$date >= "2021-11-24") & (climate$date <= "2022-12-31"), ]
## interpolate the missing data using moving average
climate_trub$prcp <- na_ma(climate_trub$prcp, k = 4, weighting = "exponential")
climate_trub$tmax <- na_ma(climate_trub$tmax, k = 4, weighting = "exponential")
climate_trub$tmin <- na_ma(climate_trub$tmin, k = 4, weighting = "exponential")
## sanity check
#str(climate_trub)
#sum(is.na(climate_trub))
#min(climate_trub$date);max(climate_trub$date)
## prcp
prcp_trub_df <- data.frame(climate_trub$date,climate_trub$prcp)
colnames(prcp_trub_df) <- c("Date","Precipitation")
## prepare data for candlestick plot
prcp_summary <- prcp_trub_df %>%
group_by(Date) %>%
summarize(min = min(Precipitation),
q1 = quantile(Precipitation, 0.25),
median = median(Precipitation),
mean = mean(Precipitation),
q3 = quantile(Precipitation, 0.75),
max = max(Precipitation))
p2 <- plot_ly(prcp_summary,
x = ~Date,
y = ~mean,
type = 'scatter', mode = 'lines')
p2 <- p2 %>% layout(title = "Daily Precipitation Time Series In Study Area",
xaxis = list(rangeslider = list(visible = T)),
yaxis = list(title = 'Average precipitation (in)'))
p2Figure 1 shows the time series plot of the average precipitation (in liquid form) in the study area. The summer and fall are the season with high precipitation. However, this is not necessary indicate that winter and spring have less total precipitation, as the snow in the winter may also contribute to the total precipitation. There might be periodicity in the precipitation pattern, which will need more analysis to confirm.
Temperature
Figure 2 shows the daily maximum and minimum temperature in study area. The temperature has a generally increasing trend. Both maximum and minimum temperature have similar trends.
Code
## t
t_trub_df <- data.frame(climate_trub$date,climate_trub$tmax, climate_trub$tmin)
colnames(t_trub_df) <- c("Date","Tmax","Tmin")
p3 <- plot_ly(t_trub_df, type = 'scatter', mode = 'lines')%>%
add_trace(x = ~Date, y = ~Tmax, name = "Max")%>%
add_trace(x = ~Date, y = ~Tmin, name = "Min")%>%
layout(title='Max and Min Temperature for Study area',
xaxis = list(rangeslider = list(visible = T),
rangeselector=list(
buttons=list(
list(count=1, label="1m", step="month", stepmode="backward"),
list(count=6, label="6m", step="month", stepmode="backward"),
list(count=1, label="YTD", step="year", stepmode="todate"),
list(count=1, label="1y", step="year", stepmode="backward")
))))
p3 <- p3 %>%
layout(
xaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff'),
yaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff',
title = 'Temperature, F'),
plot_bgcolor='#e5ecf6', margin = 0.2, width = 900)
p3Water quality
Stream discharge
Code
## import and processing data
discharge <- read.csv("./data/discharge.csv")
discharge <- discharge[discharge$X.!= "#",]
discharge <- discharge[3:dim(discharge)[1],]
colnames(discharge) <- c("prefix", "station","date","discharge","flag")
discharge = discharge[c("prefix", "station","date","discharge","flag")]
discharge$date<-as.Date(discharge$date,"%m/%d/%y")
discharge$date <- as.Date(ifelse(discharge$date > Sys.Date(),
format(discharge$date, "19%y-%m-%d"),
format(discharge$date)))
# head(discharge)
# dim(discharge)
# str(discharge)
# select data after 1970
discharge = discharge[discharge$date >= "1970-01-01", ]
# head(discharge)
# dim(discharge)
# str(discharge)
## interpolate the missing data using moving average
library(imputeTS)
discharge$discharge <- na_ma(discharge$discharge, k = 4, weighting = "exponential")
## sanity check
# str(discharge)
# sum(is.na(discharge))Code
p <- plot_ly(discharge, type = 'scatter', mode = 'lines')%>%
add_trace(x = ~date, y = ~discharge, name="discharge")%>%
layout(title='Stream discharge at Waterville, OH station',
xaxis = list(rangeslider = list(visible = T),
rangeselector=list(
buttons=list(
list(count=1, label="1m", step="month", stepmode="backward"),
list(count=6, label="6m", step="month", stepmode="backward"),
list(count=1, label="1y", step="year", stepmode="backward"),
list(count=5, label="5y", step="year", stepmode="backward"),
list(count=16, label="YTD", step="year", stepmode="todate")
)
)))
p <- p %>%
layout(
xaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff',
title = "Date"),
yaxis = list(zerolinecolor = '#ffff',
zerolinewidth = 2,
gridcolor = 'ffff',
title = "Discharge, ft<sup>3</sup>/s"),
plot_bgcolor='#e5ecf6', margin = 0.2, width = 900)
pStream flow has clear periodicity according to Figure 3. There are some high value year, for example 2015. This is caused by the 2014–2016 El Niño event.
Turbidity
Code
## read data
turb <- read.csv("./data/turbidity.csv", header = F, comment.char = "#")
turb <- turb[3:nrow(turb),1:9]
colnames(turb) <- c("prefix", "station","date","max","flag1","min","flag2","mean","flag3")
turb$date<-as.Date(turb$date,"%m/%d/%y")
## extract daily max
turb_max<- turb[c("date","max")]
turb_max$date<-as.Date(turb_max$date,"%m/%d/%y")
turb_max$max = as.numeric(turb_max$max)
## interpolate the missing data using moving average
turb_max$max <- na_ma(turb_max$max, k = 4, weighting = "exponential")
# min(turb_max$date);max(turb_max$date)
## extract to 12/31/2022 as the climate max date
turb_max = turb_max[turb_max$date <= "2022-12-31", ]
# head(turb_max)
# min(turb_max$date);max(turb_max$date)
## sanity check
# str(turb_max)
# sum(is.na(turb_max))
turb_df <- data.frame(turb_max$date,turb_max$max)
colnames(turb_df) <- c("Date","max")
## prepare data for candlestick plot
prcp_summary <- turb_df %>%
group_by(Date) %>%
summarize(min = min(max),
q1 = quantile(max, 0.25),
median = median(max),
mean = mean(max),
q3 = quantile(max, 0.75),
max = max(max))
p2 <- plot_ly(prcp_summary,
x = ~Date,
y = ~mean,
type = 'scatter', mode = 'lines')
p2 <- p2 %>% layout(title = "Maximum turbidity in studied stream",
xaxis = list(rangeslider = list(visible = T)),
yaxis = list(title = 'Turbidity, NTU'))
p2Xylem stock
Figure 5 shows the plot of Xylem stock prices from 2011 to 2023. We can see the price has increased more than triple since 2014.
Code
## read data since 2011
xyl <- getSymbols("XYL",auto.assign = FALSE, from = "2011-10-14",src="yahoo")
xyl$date<-as.Date(xyl$date,"%Y-%m-%d")
chartSeries(xyl, theme = chartTheme("white"), # Theme
bar.type = "hlc", # High low close
up.col = "green", # Up candle color
dn.col = "red") # Down candle color)
xyl.close<- Ad(xyl)